import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
# Seed NumPy's global generator so the random draws made later in this
# script are repeatable from run to run.
np.random.seed(42)

# Load the classroom data and peek at the first rows.
df = pd.read_csv('classroom_actions.csv')
df.head()

# total_days is the total amount of time each student has spent in the
# classroom. Collapse the data to one row per student id (first row wins).
df.drop_duplicates(subset=['id'], keep='first', inplace=True)

# Average classroom time for the experiment and control groups.
experiment_mean = df.query('group=="experiment"')['total_days'].mean()
control_mean = df.query('group=="control"')['total_days'].mean()

# Show both averages (control first, then experiment).
control_mean, experiment_mean

# Observed difference in average classroom time (experiment minus control).
obs_diff = experiment_mean - control_mean
obs_diff
# Build the sampling distribution of the difference in average classroom
# times (experiment minus control) with bootstrapping: resample the rows of
# df with replacement, at the original sample size, 10,000 times.
diffs = []
for _ in range(10000):
    # One bootstrap replicate: same number of rows as df, drawn with replacement.
    sample = df.sample(len(df), replace=True)
    sample_control_mean = sample.query('group=="control"').total_days.mean()
    sample_experiment_mean = sample.query('group== "experiment"').total_days.mean()
    diffs.append(sample_experiment_mean - sample_control_mean)

# Convert to a NumPy array so array statistics (std, mean, element-wise
# comparisons) can be applied to the replicates directly.
diffs = np.array(diffs)
# Plot the bootstrap sampling distribution of the difference in means.
plt.hist(diffs)

# Simulate the distribution of the statistic under the null hypothesis:
# a normal centred on 0 (no difference between groups) with the same spread
# and number of draws as the bootstrap sampling distribution.
null_vals = np.random.normal(0, diffs.std(), len(diffs))

# Plot the null distribution.
plt.hist(null_vals)

# Mark the observed statistic. This must be obs_diff (the difference measured
# on the actual data), not the mean of the bootstrap replicates.
plt.axvline(obs_diff, color='r')

# p-value: proportion of null draws larger than the observed difference
# (one-sided, in the direction experiment > control).
(null_vals > obs_diff).mean()